clear all
set more off
set type double, perm
capture log close

global d = 0

if $d == 0 { 
global topdir "YOUR DIRECTORY"
global datadir "${topdir}/data"
global outdir "${topdir}/output"
cd "${outdir}"
}

use sample group weight race female aafqt aafqt_belowp50 aafqt_abovep50 abovep50 belowp50 ln_wage using "${datadir}/nlsy_final.dta", clear


log using nlsy_yitzhaki_nonlinear.log, replace


/*  ------------------------------------------------------------------------  */
/*                        Residualization                                     */
/*  ------------------------------------------------------------------------  */

// residualize AFQT and log wage
gen res1_aafqt = .
gen res1_ln_wage = .
gen res2_aafqt = .
gen res2_ln_wage = .

foreach g in WM79 WM97 BM79 BM97 HM79 HM97 WW79 WW97 BW79 BW97 HW79 HW97 {
	
reg aafqt_belowp50 aafqt_abovep50 abovep50 if group=="`g'" [w=weight]
predict res if e(sample), resid
replace res1_aafqt = res if group=="`g'"
sum aafqt if abovep50==0 & group=="`g'" [w=weight]
replace res1_aafqt = res1_aafqt + r(mean) if group=="`g'" // add mean back
drop res
reg aafqt_abovep50 aafqt_belowp50 abovep50 if group=="`g'" [w=weight]
predict res if e(sample), resid
replace res2_aafqt = res if group=="`g'"
sum aafqt if abovep50==1 & group=="`g'" [w=weight]
replace res2_aafqt = res2_aafqt + r(mean) if group=="`g'" // add mean back
drop res

reg ln_wage aafqt_abovep50 abovep50 if group=="`g'" [w=weight]
predict res if e(sample), resid
replace res1_ln_wage = res if group=="`g'"
sum ln_wage if abovep50==0 & group=="`g'" [w=weight]
replace res1_ln_wage = res1_ln_wage + r(mean) if group=="`g'" // add mean back
drop res
reg ln_wage aafqt_belowp50 abovep50 if group=="`g'" [w=weight]
predict res if e(sample), resid
replace res2_ln_wage = res if group=="`g'"
sum ln_wage if abovep50==1 & group=="`g'" [w=weight]
replace res2_ln_wage = res2_ln_wage + r(mean) if group=="`g'" // add mean back
drop res

}

replace res1_aafqt = round(res1_aafqt,.01)
replace res2_aafqt = round(res2_aafqt,.01)

// binning
gen res1_aafqt_bin3 = 3*ceil(res1_aafqt/3) - 1
gen res2_aafqt_bin3 = 3*ceil(res2_aafqt/3) - 1


// ols
foreach g in WM79 WM97 WW79 WW97 {
	reg ln_wage aafqt if abovep50==0 & group=="`g'" [w=weight]
	scalar ols_wt_`g'_belowp50 = round(_b[aafqt],0.001)
	reg ln_wage aafqt if abovep50==1 & group=="`g'" [w=weight]
	scalar ols_wt_`g'_abovep50 = round(_b[aafqt],0.001)
}

drop aafqt ln_wage

tempfile basedata
save `basedata'



foreach y in res1 res2 {
	
	
/*  ------------------------------------------------------------------------  */
/*               Data Preparation for Yitzhaki Decomposition                  */
/*  ------------------------------------------------------------------------  */

use `basedata', clear

ren `y'_aafqt aafqt 
ren `y'_aafqt_bin3 aafqt_bin3
ren `y'_ln_wage ln_wage

// group data 
** by AAFQT
sort sample race female aafqt
by sample race female aafqt: egen wt_groupsize = total(weight) // group size including sample weight
by sample race female aafqt: egen sum_ln_wage = total(ln_wage*weight)
gen mean_ln_wage = sum_ln_wage/wt_groupsize // weighted avg log wage

** by AAFQT bin3
sort sample race female aafqt_bin3
by sample race female aafqt_bin3: egen wt_groupsize_bin3 = total(weight) // group size including sample weight
by sample race female aafqt_bin3: egen sum_ln_wage_bin3 = total(ln_wage*weight)
gen mean_bin3_ln_wage = sum_ln_wage/wt_groupsize // weighted avg log wage

// tag
egen tag = tag(sample race female aafqt)

// save
sort sample race female aafqt
tempfile rawdata
save `rawdata'



/*  ------------------------------------------------------------------------  */
/*               Decomposition by Race/Ethnicity and Gender                   */
/*  ------------------------------------------------------------------------  */

// collapse
use `rawdata'
keep if tag==1
keep sample race female group wt_groupsize mean_ln_wage aafqt aafqt_bin3
	
	
// calculate beta for each observation
sort sample race female aafqt
by sample race female: gen b = (mean_ln_wage[_n+1]-mean_ln_wage[_n])/(aafqt[_n+1]-aafqt[_n])


// Yitzhaki weight
// use Equation (3) of our paper (incorporating multiple obs per X and sample weights)
** term1 is the quadratic function of CDF, which reaches peak at median
** term2 is the "dispersion", i.e. the diff between the mean of above v.s. the mean of below
** term3 is deltaX, which is constant in most cases (and would be gone in continuous case)

sort sample race female aafqt

by sample race female: gen rsumN = sum(wt_groupsize) // running sum of group size ("weight")
by sample race female: egen sumN = total(wt_groupsize) // total sum of group size
by sample race female: gen rsumNh = sum(wt_groupsize*aafqt) // running sum of group size * X
by sample race female: egen sumNh = total(wt_groupsize*aafqt) // total sum of group size * X

by sample race female: gen mean_aafqt = sumNh/sumN // (weighted) sample mean
by sample race female: egen var_aafqt = total(wt_groupsize*(aafqt-mean_aafqt)^2*(1/sumN)) // (weighted) sample variance

by sample race female: gen term1 = (1/var_aafqt) * (rsumN/sumN) * ((sumN-rsumN)/sumN)
by sample race female: gen term2 = (sumNh - rsumNh)/(sumN - rsumN) - rsumNh/rsumN
by sample race female: gen term3 = aafqt[_n+1] - aafqt[_n]

gen ols_weight = term1 * term2 * term3

drop rsumN sumN rsumNh sumNh mean_aafqt var_aafqt term1 term2 term3


// cumulative contribution
sort sample race female aafqt
gen bw = b*ols_weight
by sample race female: gen ols_run_bw = sum(bw)
by sample race female: egen ols_total_bw = total(bw)
sort sample race female aafqt_bin3 aafqt
by sample race female aafqt_bin3: gen ols_run_bw_bin3 = ols_run_bw[_N]


// save
keep sample race female aafqt aafqt_bin3 ols_run_bw ols_run_bw_bin3 

foreach x in * {
	ren `x' `y'_`x'
}
foreach x in sample race female {
	ren `y'_`x' `x' 
}

tempfile `y'_merged
save ``y'_merged'

}

// merge 
use `res1_merged'
append using `res2_merged'



/*  ------------------------------------------------------------------------  */
/*            			      	 Figure G1               	 		          */
/*  ------------------------------------------------------------------------  */

foreach g in WM79 WM97 WW79 WW97 {
	local ols`g'_below = scalar(ols_wt_`g'_belowp50)
	local ols`g'_above = scalar(ols_wt_`g'_abovep50)
}

twoway (scatter res1_ols_run_bw_bin3 res1_aafqt_bin3 if  sample==0 & race==3 & female==0 & res1_aafqt_bin3!=149, mc(dkgreen) msymbol(circle_hollow) msize(small)) ///
	(scatter res1_ols_run_bw_bin3 res1_aafqt_bin3 if  sample==0 & race==3 & female==0 & res1_aafqt_bin3==149, mc(dkgreen) msymbol(triangle_hollow) msize(large)) ///
	(scatter res1_ols_run_bw_bin3 res1_aafqt_bin3 if  sample==1 & race==3 & female==0 & res1_aafqt_bin3!=152, mc(orange_red) msize(small)) ///
	(scatter res1_ols_run_bw_bin3 res1_aafqt_bin3 if  sample==1 & race==3 & female==0 & res1_aafqt_bin3==152, mc(orange_red) msymbol(triangle) msize(large)) ///
	(lowess res1_ols_run_bw res1_aafqt if  sample==0 & race==3 & female==0, lc(dkgreen) bw(0.3)) ///
	(lowess res1_ols_run_bw res1_aafqt if  sample==1 & race==3 & female==0, lc(orange_red) bw(0.3)) ///
	(scatter res2_ols_run_bw_bin3 res2_aafqt_bin3 if  sample==0 & race==3 & female==0 & res2_aafqt_bin3!=197, mc(dkgreen) msymbol(circle_hollow) msize(small)) ///
	(scatter res2_ols_run_bw_bin3 res2_aafqt_bin3 if  sample==0 & race==3 & female==0 & res2_aafqt_bin3==197, mc(dkgreen) msymbol(triangle_hollow) msize(large)) ///
	(scatter res2_ols_run_bw_bin3 res2_aafqt_bin3 if  sample==1 & race==3 & female==0 & res2_aafqt_bin3!=197, mc(orange_red) msize(small)) ///
	(scatter res2_ols_run_bw_bin3 res2_aafqt_bin3 if  sample==1 & race==3 & female==0 & res2_aafqt_bin3==197, mc(orange_red) msymbol(triangle) msize(large)) ///
	(lowess res2_ols_run_bw res2_aafqt if  sample==0 & race==3 & female==0, lc(dkgreen) bw(0.3)) ///
	(lowess res2_ols_run_bw res2_aafqt if  sample==1 & race==3 & female==0, lc(orange_red) bw(0.3)), ///
	legend(order(1 "NLSY-79: binned" 3 "NLSY-97: binned" 5 "NLSY-79: smoothed" 6 "NLSY-97: smoothed")) ///
	scheme(s1color) xtitle("Residualized AAFQT bin (rescaled to AAFQT level)") ytitle("Cumulative contribution to OLS") ///
	yline(`olsWM79_below',lc(dkgreen) lp(dot)) yline(`olsWM97_below',lc(orange_red) lp(dot)) ///
	yline(`olsWM79_above',lc(dkgreen) lp(dot)) yline(`olsWM97_above',lc(orange_red) lp(dot)) ///
	xsc(r(70 230)) xlabel(75(25)225) ysc(r(-0.7 1.3)) ylabel(-0.6(0.3)1.2)
	
graph export FigureG1_WM.pdf, replace
graph export FigureG1_WM.eps, replace
graph save FigureG1_WM, replace	
	
	
twoway (scatter res1_ols_run_bw_bin3 res1_aafqt_bin3 if  sample==0 & race==3 & female==1 & res1_aafqt_bin3!=152, mc(dkgreen) msymbol(circle_hollow) msize(small)) ///
	(scatter res1_ols_run_bw_bin3 res1_aafqt_bin3 if  sample==0 & race==3 & female==1 & res1_aafqt_bin3==152, mc(dkgreen) msymbol(triangle_hollow) msize(large)) ///
	(scatter res1_ols_run_bw_bin3 res1_aafqt_bin3 if  sample==1 & race==3 & female==1 & res1_aafqt_bin3!=155, mc(orange_red) msize(small)) ///
	(scatter res1_ols_run_bw_bin3 res1_aafqt_bin3 if  sample==1 & race==3 & female==1 & res1_aafqt_bin3==155, mc(orange_red) msymbol(triangle) msize(large)) ///
	(lowess res1_ols_run_bw res1_aafqt if  sample==0 & race==3 & female==1, lc(dkgreen) bw(0.3)) ///
	(lowess res1_ols_run_bw res1_aafqt if  sample==1 & race==3 & female==1, lc(orange_red) bw(0.3)) ///
	(scatter res2_ols_run_bw_bin3 res2_aafqt_bin3 if  sample==0 & race==3 & female==1 & res2_aafqt_bin3!=194, mc(dkgreen) msymbol(circle_hollow) msize(small)) ///
	(scatter res2_ols_run_bw_bin3 res2_aafqt_bin3 if  sample==0 & race==3 & female==1 & res2_aafqt_bin3==194, mc(dkgreen) msymbol(triangle_hollow) msize(large)) ///
	(scatter res2_ols_run_bw_bin3 res2_aafqt_bin3 if  sample==1 & race==3 & female==1 & res2_aafqt_bin3!=197, mc(orange_red) msize(small)) ///
	(scatter res2_ols_run_bw_bin3 res2_aafqt_bin3 if  sample==1 & race==3 & female==1 & res2_aafqt_bin3==197, mc(orange_red) msymbol(triangle) msize(large)) ///
	(lowess res2_ols_run_bw res2_aafqt if  sample==0 & race==3 & female==1, lc(dkgreen) bw(0.3)) ///
	(lowess res2_ols_run_bw res2_aafqt if  sample==1 & race==3 & female==1, lc(orange_red) bw(0.3)), ///
	legend(order(1 "NLSY-79: binned" 3 "NLSY-97: binned" 5 "NLSY-79: smoothed" 6 "NLSY-97: smoothed")) ///
	scheme(s1color) xtitle("Residualized AAFQT bin (rescaled to AAFQT level)") ytitle("Cumulative contribution to OLS") ///
	yline(`olsWW79_below',lc(dkgreen) lp(dot)) yline(`olsWW97_below',lc(orange_red) lp(dot)) ///
	yline(`olsWW79_above',lc(dkgreen) lp(dot)) yline(`olsWW97_above',lc(orange_red) lp(dot)) ///
	xsc(r(70 230)) xlabel(75(25)225) ysc(r(-0.7 1.3)) ylabel(-0.6(0.3)1.2)
	
graph export FigureG1_WW.pdf, replace
graph export FigureG1_WW.eps, replace
graph save FigureG1_WW, replace	





log close

exit



